The purpose of this document is to generate differential expression data for the Klotho mice. Based on the analysis in 1.Klotho_Initial_Data_Visualization.Rmd, there isn’t much difference between the heterozygous and homozygous carriers of each variant. Thus, to get differential expression for downstream analysis, we will compare carriers to the WT mice.

Here we generate tables that will be used in a comparison to human differential expression. We will get differential expression values for each gene with the WT as the reference, so that a negative value means that the transcript has reduced abundance in the variant carrier relative to the WT.

rm(list = ls())

library(here)


args <- commandArgs(trailingOnly=T)
comparison.type <- args[1]
age.batch <- args[2]

if(is.na(comparison.type)){
    #comparison.type = "Hom" #compares homozygous FC/FC and VS/VS to WT/WT
    comparison.type = "Het" #compares homozygous WT/FC and WT/VS to WT/WT
    #comparison.type = "Carrier" #compares all carriers of VS and FC alleles to WT/WT
}

if(comparison.type == "Hom"){
    vs.allele <- "VS/VS"; fc.allele = "FC/FC"
}
if(comparison.type == "Het"){
    vs.allele <- "WT/VS"; fc.allele = "WT/FC"
}
if(comparison.type == "Carrier"){
    vs.allele <- "VS"; fc.allele = "FC"
}


if(is.na(age.batch)){
    age.batch <- 12
}

#subgroup <- NULL
subgroup <- list("age_batch" = as.numeric(age.batch))
#subgroup <- list("age_batch" = 12)
#subgroup <- list("age_batch" = 4, "sex_ge" = "Female") #example with more than one filter

subgroup.results <- paste(sapply(1:length(subgroup), 
function(x) paste(names(subgroup)[x], subgroup[[x]], sep = "_")), 
collapse = "_")
results.dir <- here("Results", subgroup.results)


processed.data.dir <- file.path(results.dir, "processed_data")
general.processed.data.dir <- here("Results", "Processed_Data")
general.data.dir <- here("Data", "general")

The analysis compares WT/FC mice and WT/VS mice to WT/WT mice. The age of the mice analyzed is 12 months.

all.fun <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.fun)){source(all.fun[i])}
needed.libraries <- c("synapser", "pheatmap", "DESeq2", "DT", "vioplot", "RColorBrewer",
  "gprofiler2", "cluster", "pathview", "clusterProfiler", "stringr", "igraph",
  "fgsea")
load_libraries(needed.libraries)

This table uses processed data from 1.Klotho_Initial_Data_Visualization.Rmd

info <- read.table(file.path(processed.data.dir, "mouse_info.csv"), sep = ",", header = TRUE)
raw.expr <- read.table(here("Data", "Mouse", "rsem.merged.gene_counts.tsv"), header = TRUE)
norm.expr <- readRDS(file.path(processed.data.dir, "Normalized_Expression.RDS"))
scaled.expr <- readRDS(file.path(processed.data.dir, "Scaled_Expression.RDS"))

Calculate differential expression for each genotype compared to WT.

gene_t_test <- function(x, y){
    varx <- var(x)
    vary <- var(y)
    if(varx > 0 && vary > 0){
        result <- t.test(x,y)
    }else{
        result <- NA
    }
    return(result)
}


geno_de <- function(variant = "VS", covar.table, expr.mat){
    var.idx <- grep(variant, covar.table[,"genotype"])
    #var.names <- covar.table[var.idx,"animalName"]
    wt.idx <- which(covar.table[,"genotype"] == "WT/WT")
    #wt.names <- covar.table[wt.idx,"animalName"]
    #boxplot(list("WT" = expr.mat[2,wt.idx], "Var" = expr.mat[2,var.idx]))
    sex.covar <- matrix(as.numeric(as.factor(covar.table[,"sex_ge"]))-1, ncol = 1)
    adj.expr <- adjust(t(expr.mat), sex.covar)
    test <- apply(adj.expr, 2, function(x) gene_t_test(x[wt.idx], x[var.idx]))
    test.info <- t(sapply(test, function(x) if(length(x) > 1){c(x$estimate, x$conf.int, x$p.value)}else{rep(NA, 5)}))
    colnames(test.info) <- c("Mean_WT", paste0("Mean_", variant), 
        "conf_int_wt", paste0("conv_int_", variant), "p")
    return(test.info)
}


#This function treats each SNP in the haplotype independently
#In humans there are two haplotypes with two SNPs. Either FC
#or VS. In mice the haplotype is recombined, and they have 
#the FS haplotype. This means we can count the number of F
#genotypes at the first position and the number of S genotypes
#at the second position. We will model the effects of the
#SNPs from the human protective haplotype (V and S). To do
#this, we create dummy variables for each SNP and code each
#position depending on the number of V and S alleles in each
#mouse.

dummy_geno <- function(genotypes){
    split.geno <- strsplit(genotypes, "/")
    numV <- numS <- rep(NA, length(genotypes))

    for(i in 1:length(split.geno)){
        split.geno[[i]][which(split.geno[[i]] == "WT")] <- "FS"
        split.alleles <- unlist(strsplit(split.geno[[i]], "", fixed = TRUE))
        numV[i] <- length(which(split.alleles == "V"))
        numS[i] <- length(which(split.alleles == "S"))
    }

    result <- cbind(numV, numS)
    colnames(result) <- c("V", "S")
    return(result)
}

gene_by_var <- function(covar.table, expr.mat){
    dummy.genotype <- dummy_geno(covar.table[,"genotype"])
    sex.covar <- matrix(as.numeric(as.factor(covar.table[,"sex_ge"]))-1, ncol = 1)
    adj.expr <- adjust(t(expr.mat), sex.covar)
    
    test.df <- data.frame("expr" = adj.expr[,1], "V" = dummy.genotype[,"V"], 
        "S" = dummy.genotype[,"S"], "Int" = dummy.genotype[,"V"]*dummy.genotype[,"S"]/2)
    test <- lm(expr~0+V+S+Int, data = test.df)

    test <- apply(adj.expr, 2, 
        function(x) lm(x~-1+dummy.genotype[,"V"]*dummy.genotype[,"S"]))
    test.coef <- t(sapply(test, function(x) x$coef[2:3]))
    p <- t(sapply(test, function(x) coef(summary(x))[,"Pr(>|t|)"][2:3]))
    f <- lapply(test, function(x) summary(x)$fstatistic)
    total.p <- sapply(f, function(x) pf(x[1],x[2],x[3],lower.tail=F))
    #qqunif.plot(p[,1])
    test.info <- cbind(test.coef, p, total.p)
    colnames(test.info) <- c("V_coef", "S_coef", "V_p", "S_p", "model_p")
    return(test.info)
}

QQ plots

The QQ plots for the DE p values are shown below. Not too impressive.

par(mfrow = c(1,2))
vs.diff.expr <- geno_de(vs.allele, info, scaled.expr) #can be VS, WT/VS, or VS/VS
qqunif.plot(vs.diff.expr[,"p"], plot.label = "WT vs VS")
#boxplot(vs.diff.expr[,1:2])
write.table(vs.diff.expr, file.path(processed.data.dir, 
    paste0(gsub("/", "_", vs.allele), "_DEG.csv")), sep = ",", quote = FALSE)

fc.diff.expr <- geno_de(fc.allele, info, scaled.expr) #can be FC, WT/FC, or FC/FC
qqunif.plot(fc.diff.expr[,"p"], plot.label = "WT vs. FC")

#boxplot(fc.diff.expr[,1:2])
write.table(fc.diff.expr, file.path(processed.data.dir, 
    paste0(gsub("/", "_", fc.allele), "_DEG.csv")), sep = ",", quote = FALSE)
#We tried looking at each allele separately by testing the effects of
#the number of V alleles and the number of S alleles in each mouse.
#but we can't get the interaction between them.


allele.test <- gene_by_var(info, norm.expr)

par(mfrow = c(1,3))
qqunif.plot(allele.test[,"V_p"], plot.label = "V allele")
qqunif.plot(allele.test[,"S_p"], plot.label = "S allele")
qqunif.plot(allele.test[,"model_p"], plot.label = "Overall Model")

Volcano Plots

Below are the volcano plots for each variant.

par(mfrow = c(1,2))
vs.de <- vs.diff.expr[,2] - vs.diff.expr[,1]
plot(vs.de, -log10(vs.diff.expr[,"p"]), main = paste(vs.allele, "vs. WT"), 
    ylab = "-log10(pval)", xlab = "Effect Size")

fc.de <- fc.diff.expr[,2] - fc.diff.expr[,1]
plot(fc.de, -log10(fc.diff.expr[,"p"]), main = paste(fc.allele, "vs. WT"), 
    ylab = "-log10(pval)", xlab = "Effect Size")

VS-FC comparison

The plots below show the VS-FC comparison

par(mfrow = c(1,2))
plot(-log10(vs.diff.expr[,"p"]), -log10(fc.diff.expr[,"p"]), 
    xlab = paste(vs.allele, "-log10(p)"), ylab = paste(fc.allele, "-log10(p)"))
plot.with.model(vs.de, fc.de, xlab = paste(vs.allele, "vs. WT"), 
    ylab = paste(fc.allele, "vs. WT"), report = "cor.test")

VS-FC enrichment

If we look at genes that are “significantly” differentially expressed in both VS and DE, we see a different picture.

alpha <- 0.05
sig.idx <- intersect(which(vs.diff.expr[,"p"] <= alpha), which(fc.diff.expr[,"p"] <= alpha))
plot(vs.de[sig.idx], fc.de[sig.idx], xlab = paste(vs.allele, "vs. WT"), 
    ylab = paste(fc.allele, "vs. WT"))
abline(h = 0, v = 0)

sig.vs <- vs.diff.expr[sig.idx,]
sig.fc <- fc.diff.expr[sig.idx,]

get_quadrants <- function(x, y, x.cutoff = 0, y.cutoff = 0){
    high.x.high.y <- intersect(which(x > x.cutoff), which(y > y.cutoff))
    high.x.low.y <- intersect(which(x > x.cutoff), which(y < y.cutoff))
    low.x.high.y <- intersect(which(x < x.cutoff), which(y > y.cutoff))
    low.x.low.y <- intersect(which(x < x.cutoff), which(y < y.cutoff))
    result <- list("xhigh_yhigh" = high.x.high.y, "xhigh_ylow" = high.x.low.y,
        "xlow_yhigh" = low.x.high.y, "xlow_ylow" = low.x.low.y)
    return(result)
}


quadrant.idx <- get_quadrants(vs.de[sig.idx], fc.de[sig.idx])
names(quadrant.idx) <- gsub("y", paste0(vs.allele, "_"), gsub("x", paste0(vs.allele, "_"), names(quadrant.idx)))
quadrant.enrich <- lapply(quadrant.idx, 
    function(x) gost(rownames(sig.vs)[x], organism = "mmusculus", 
    source = c("GO", "KEGG", "REACTOME", "HP", "CORUM")))
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
#terms <- plot.enrichment.group(quadrant.enrich, return.empty.columns = TRUE, max.term.size = 3000)


terms <- plot.enrichment.group(quadrant.enrich, plot.results = FALSE, 
    return.empty.columns = TRUE, max.term.size = 3000)

low_high_to_num <- function(vals, low.val = -0.5, high.val = 0.5){
    numV <- rep(NA, length(vals))
    numV[which(vals == "high")] <- high.val
    numV[which(vals == "low")] <- low.val
    return(numV)
}

split.names <- strsplit(colnames(terms), "_")
x.pos <- sapply(split.names, function(x) x[2])
y.pos <- sapply(split.names, function(x) x[4])
x.coord <- low_high_to_num(x.pos)
y.coord <- low_high_to_num(y.pos)


plot(0,1, xlim = c(-1, 1), ylim = c(-1, 1), type = "n",
    xlab = paste(vs.allele, "vs. WT"), ylab = paste(fc.allele, "vs. WT"), 
    main = "Enrichments by quadrant")
abline(h = 0, v = 0)
for(i in 1:length(x.pos)){
    sig.idx <- which(terms[,i] > 0)
    if(length(sig.idx) == 0){
        enrich.words = "No Enrichment"
    }else{
        enrich.words <- words_to_paragraph(names(sig.idx), line.len = 2)
    }
    plot.text(enrich.words, x = x.coord[i], y = y.coord[i], add = TRUE, cex = 0.7)   
}

GSEA

Instead of using a significance cutoff, we can use GSEA to look at enrichments for the genes that have positive and negative coefficients relative to wild type mice.

We can use several lists for comparison: Biodomains, KEGG pathways, and the intersection between biodomains and KEGG pathways. These were generated by 1.Klotho_Initial_Data_Visualiztion.Rmd and stored in Results/Processed_Data

bd.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_for_GSEA.RDS"))
bd.go.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_sub_GO_for_GSEA.RDS"))
kegg.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_for_GSEA.RDS"))
intersect.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_Intersections_for_GSEA.RDS"))


run_gsea <- function(vals, pos.neg = "pos", gsea.list){
    if(pos.neg == "pos"){
        idx <- which(vals > 0)
        idx.order <- order(vals[idx], decreasing = TRUE)
        sorted.vals <- vals[idx[idx.order]]
        #plot(sorted.vals)
    }
    if(pos.neg == "neg"){
        idx <- which(vals < 0)
        idx.order <- order(abs(vals[idx]), decreasing = TRUE)
        sorted.vals <- abs(vals[idx[idx.order]])
        #plot(sorted.vals)
    }

    gsea.enrich <- fgsea::fgsea(gsea.list, sorted.vals, scoreType = "pos")
    norm.es <- as.numeric(as.matrix(gsea.enrich[,"NES"]))
    pval <- as.numeric(as.matrix(gsea.enrich[,"padj"]))
    names(norm.es) <- names(pval) <- gsea.enrich$pathway
    result <- cbind(norm.es, pval)
    return(result)
}
var.de.list <- list("VS" = vs.de, "FC" = fc.de)
enrich.types <- c("pos", "neg")
combo.names <- paste(rep(names(var.de.list), each = length(enrich.types)), rep(enrich.types, length(var.de.list)), sep = "_")

bd.gsea <- vector(mode = "list", length = length(combo.names))
names(bd.gsea) <- combo.names

idx <- 1
for(v in 1:length(var.de.list)){
    for(tp in 1:length(enrich.types)){
        bd.gsea[[idx]] <- run_gsea(var.de.list[[v]], enrich.types[tp], bd.list)
        idx <- idx + 1
    }
}

par(mar = c(4,12,2,2), mfrow = c(2,2))
for(i in 1:length(bd.gsea)){
    barplot(sort(bd.gsea[[i]][,1]), las = 2, horiz = TRUE, main = combo.names[i])    
}

The heat map below shows the normalized enrichment score for each direction of each variant. The strongest enrichments were upregulated by the FC (common) variant and were tau homeostasis and synapse.

plot_nes <- function(nes.mat, color.scale = "blue",  col.text.rotation = 0, 
    col.text.adj = 0.5, col.text.shift = 0.05, row.text.shift = 0.18,
    mat.mar = c(4, 14, 2, 0), bar.mar = c(24, 3,4,4)){

    layout(matrix(c(1,2), nrow = 1), widths = c(1, 0.3))
    row.order <- hclust(dist(nes.mat))$order
    par(mar = mat.mar)

    mat.col <- imageWithText(nes.mat[row.order,], col.scale = color.scale, light.dark = "f", 
        col.text.rotation = col.text.rotation, col.text.adj = col.text.adj, 
        col.text.shift = col.text.shift, row.text.shift = row.text.shift)
    par(mar = bar.mar)
    bar.col = imageWithTextColorbar(nes.mat, col.scale = color.scale, light.dark = "f", 
        cex = 0.7, bar.lwd = 4, round.nearest = 0.1)
}
enrich.bd <- sapply(bd.gsea, function(x) x[,1])
plot_nes(enrich.bd, col.text.rotation = 15)

top.n <- 10

kegg.gsea <- vector(mode = "list", length = length(combo.names))
names(kegg.gsea) <- combo.names

idx <- 1
for(v in 1:length(var.de.list)){
    for(tp in 1:length(enrich.types)){
        kegg.gsea[[idx]] <- run_gsea(var.de.list[[v]], enrich.types[tp], kegg.list)
        idx <- idx + 1
    }
}

#par(mar = c(4,12,2,2), mfrow = c(2,2))
#for(i in 1:length(bd.gsea)){
#    barplot(sort(kegg.gsea[[i]][,1], decreasing = TRUE)[1:top.n], las = 2, horiz = TRUE, main = combo.names[i])    
#}

common.names <- Reduce("intersect", lapply(kegg.gsea, rownames))
enrich.kegg <- matrix(NA, nrow = length(common.names), ncol = 4)
rownames(enrich.kegg) <- common.names
colnames(enrich.kegg) <- c(paste(vs.allele, "up"), paste(vs.allele, "down"), paste(fc.allele, "up"), paste(fc.allele, "down"))
for(i in 1:length(kegg.gsea)){
    enrich.kegg[common.names,i] <- kegg.gsea[[i]][common.names, 1]
}

#filter to rows that have at least one value above a threshold
high.idx <- which(apply(enrich.kegg, 1, max) > 1.5)
#pdf("~/Desktop/gsea_kegg.pdf", width = 7, height = 12)
plot_nes(enrich.kegg[high.idx,])

#dev.off()

#pairs(enrich.kegg)
intersect.gsea <- vector(mode = "list", length = length(combo.names))
names(intersect.gsea) <- combo.names

idx <- 1
for(v in 1:length(var.de.list)){
    for(tp in 1:length(enrich.types)){
        test <- lapply(intersect.list, function(x) run_gsea(var.de.list[[v]], enrich.types[tp], x))
        nes <- sapply(test, function(x) x[,"norm.es"])
        intersect.gsea[[idx]] <- nes
        idx <- idx + 1
    }
}

par(mar = c(4,12,2,2), mfrow = c(2,2))
for(i in 1:length(intersect.gsea)){
    has.vals <- which(sapply(intersect.gsea[[i]], length) > 0)
    score.order <- order(sapply(intersect.gsea[[i]][has.vals], median), decreasing = FALSE)
    vioplot(intersect.gsea[[i]][has.vals[score.order]], las = 2, main = combo.names[i], horizontal = TRUE, col = "lightgray")
    stripchart(intersect.gsea[[i]][has.vals[score.order]], las = 2, main = combo.names[i], method = "jitter",
        pch = 16, col = "darkgray", add = TRUE) 
}

unlisted.gsea <- intersect.gsea
for(i in 1:length(unlisted.gsea)){
    unlisted.gsea[[i]] <- unlist(intersect.gsea[[i]], recursive = FALSE)
}
common.names <- Reduce("intersect", lapply(unlisted.gsea, names))
enrich.intersect <- matrix(NA, nrow = length(common.names), ncol = 4)
rownames(enrich.intersect) <- common.names
colnames(enrich.intersect) <- c(paste(vs.allele, "up"), paste(vs.allele, "down"), paste(fc.allele, "up"), paste(fc.allele, "down"))
for(i in 1:length(intersect.gsea)){
    enrich.intersect[common.names,i] <- unlisted.gsea[[i]][common.names]
}

#filter to rows that have at least one value above a threshold
high.idx <- which(apply(enrich.intersect, 1, max) > 1.9)
#length(high.idx)

#png("~/Desktop/gsea_intersect.png", width = 7, height = 22, units = "in", res = 300)
plot_nes(enrich.intersect[high.idx,], mat.mar = c(4, 22, 2, 0), col.text.rotation = 90,
 bar.mar = c(15, 3, 5, 4), col.text.shift = 0.01, col.text.adj = 1)

#dev.off()

The top terms from each group are shown below.

#pdf("~/Desktop/top_enrich.pdf", width = 9, height = 5)
for(i in 1:ncol(enrich.intersect)){
    top.terms <- sort(enrich.intersect[,i], decreasing = TRUE)[1:10]
    par(mar = c(4, 25, 2, 2))
    barplot(rev(top.terms), horiz = TRUE, las = 2, main = colnames(enrich.intersect)[i])
}

#dev.off()